library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(ggplot2)library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(ggplot2)raw <- read.csv("/Volumes/seas-zhukai/phenology/phenology_leaf_flower_lag/Herbarim_flower/combined_occ_img_downloaded.csv") %>%
dplyr::select(day, month, year, startDayOfYear, coordinateUncertaintyInMeters, decimalLongitude, decimalLatitude, filename_image, species, genus,family)
phenology <- read.csv("/Volumes/seas-zhukai/phenology/phenology_leaf_flower_lag/Herbarim_flower/phenology.csv") %>%
mutate(filename_image = gsub(".txt", "", file_name))
joint_data <- left_join(phenology,raw, by = "filename_image")joint_data_flower <- joint_data %>% #116805
filter(flower_one > 0 & flower_many > 0) %>%
filter(!is.na(startDayOfYear)) %>%
filter(year>=1895) %>%
dplyr::select(decimalLongitude, decimalLatitude, startDayOfYear, year, species, genus, family) %>%
# delete family Pinaceae and Cupressaceae
filter(family!="Pinaceae" & family!="Cupressaceae") %>%
rename(lon = decimalLongitude, lat = decimalLatitude, doy = startDayOfYear) %>%
distinct() # clear herbarium data for repeat file and repeat file with different phenology
# two repeated component:
# 1. completely the same ~300
# 2. same specimen (different name) with different phenology (as long as the flower is consistent, we will keep them) ~2000library(raster)
# extract the climate normality
complete_period_raster <- raster("../data/prism/complete_period_springmean.tif")
joint_data_flower_normality <- joint_data_flower %>%
dplyr::select(lat, lon) %>%
distinct() %>%
mutate(complete_period_temp = extract(complete_period_raster, cbind(lon, lat)))
# extract the climate anormality
# Initialize an empty data frame to store the results
joint_data_flower_anormality <- data.frame()
# Loop through the specified years
for (fo_year in 1895:2023) {
# Load the yearly raster file
yearly_raster <- raster(paste0("../data/prism/", fo_year, "_springmean.tif"))
# Process the joint_data_flower for the current year
yearly_data <- joint_data_flower %>%
dplyr::select(year, lat, lon) %>%
distinct() %>%
filter(year == fo_year) %>%
mutate(yearly_temp = extract(yearly_raster, cbind(lon, lat)))
# Append the yearly data to the cumulative data frame
joint_data_flower_anormality <- rbind(joint_data_flower_anormality, yearly_data)
}
# Combine the normality and anormality data
temperature_data <- joint_data_flower_normality %>%
right_join(joint_data_flower_anormality, by = c("lat", "lon")) %>%
rename(norm = complete_period_temp, yeart = yearly_temp) %>%
mutate(anom = yeart - norm) %>%
right_join(joint_data_flower, by = c("lat", "lon", "year")) %>%
filter(!is.na(anom))
write.csv(temperature_data, "../data/herb_temperature_data.csv")library(mvoutlier)Loading required package: sgeostat
temperature_data <- read.csv("../data/herb_temperature_data.csv")
temperature_data_clean <- temperature_data %>%
group_by(species) %>%
filter(n() > 30) %>%
group_modify(~ {
data_matrix <- dplyr::select(.x, yeart, doy) %>% as.matrix()
outlier_result <- aq.plot(data_matrix)
.x %>% mutate(outliers = outlier_result$outliers)
}) %>%
ungroup()temperature_data_clean %>%
group_by(species) %>%
summarise(total = n(),
num_outliers = sum(outliers),
delete_p = num_outliers/total) %>%
arrange(desc(delete_p))# A tibble: 74 × 4
species total num_outliers delete_p
<chr> <int> <int> <dbl>
1 Quercus sadleriana 61 20 0.328
2 Betula lenta 188 58 0.309
3 Acer tataricum 59 17 0.288
4 Betula papyrifera 190 54 0.284
5 Fraxinus americana 181 50 0.276
6 Betula alleghaniensis 145 35 0.241
7 Betula populifolia 108 24 0.222
8 Quercus durata 61 13 0.213
9 Quercus john-tuckeri 38 8 0.211
10 Quercus stellata 87 18 0.207
# ℹ 64 more rows
temperature_data_model <- temperature_data_clean %>%
filter(!outliers) %>% # Correctly referencing the outliers column
group_by(species) %>%
filter(n_distinct(doy, anom, norm) > 30) %>% # Use n_distinct() for distinct counting
ungroup()source("../scripts/function_visionalize_summmary_MLmodel.R")
# Apply the function to each species and store the results
results <- list()
unique_species <- unique(temperature_data_model$species)
for (species_name in unique_species) {
results[[species_name]] <- analyze_species(temperature_data_model, species_name)
}
# Combine all summary rows into a single data frame
summary_results <- bind_rows(lapply(results, function(res) res$summary))
# Save all plots to a single PDF file
pdf("../data/species_plots_herb.pdf", width = 8, height = 6)
for (species_name in unique_species) {
print(results[[species_name]]$plot)
}
dev.off()
write.csv(summary_results, "../species_summary_herb.csv", row.names = FALSE)library(data.tree)
# Example data
species_data <- temperature_data_model %>%
group_by(species, genus, family) %>%
summarise(specimen_count = n())`summarise()` has grouped output by 'species', 'genus'. You can override using
the `.groups` argument.
# Aggregate data at genus level
genus_data <- species_data %>%
group_by(family, genus) %>%
summarise(specimen_count = sum(specimen_count))`summarise()` has grouped output by 'family'. You can override using the
`.groups` argument.
# Aggregate data at family level
family_data <- species_data %>%
group_by(family) %>%
summarise(specimen_count = sum(specimen_count))
# Total specimen count
total_specimen_count <- sum(species_data$specimen_count)
# Create hierarchical data
hierarchical_data <- species_data %>%
mutate(level = "species") %>%
bind_rows(genus_data %>% mutate(species = "", level = "genus")) %>%
bind_rows(family_data %>% mutate(genus = "", species = "", level = "family")) %>%
ungroup() %>%
arrange(family, genus, species)
# Create a data tree structure
tree_data <- hierarchical_data %>%
mutate(pathString = paste("Total", family, genus, species, sep = "/")) %>%
dplyr::select(pathString, specimen_count)
tree <- as.Node(tree_data)
tree$specimen_count <- total_specimen_count
# Print the tree structure to console
print(tree, "specimen_count") levelName specimen_count
1 Total 7338
2 ¦--Betulaceae 1027
3 ¦ °--Betula 1027
4 ¦ ¦--Betula alleghaniensis 110
5 ¦ ¦--Betula glandulosa 64
6 ¦ ¦--Betula lenta 130
7 ¦ ¦--Betula nigra 95
8 ¦ ¦--Betula occidentalis 162
9 ¦ ¦--Betula papyrifera 136
10 ¦ ¦--Betula populifolia 84
11 ¦ ¦--Betula pumila 207
12 ¦ °--Betula sandbergii 39
13 ¦--Fagaceae 1845
14 ¦ °--Quercus 1845
15 ¦ ¦--Quercus agrifolia 207
16 ¦ ¦--Quercus alba 75
17 ¦ ¦--Quercus berberidifolia 69
18 ¦ ¦--Quercus chrysolepis 201
19 ¦ ¦--Quercus durata 48
20 ¦ ¦--Quercus ellipsoidalis 34
21 ¦ ¦--Quercus falcata 41
22 ¦ ¦--Quercus gambelii 120
23 ¦ ¦--Quercus garryana 50
24 ¦ ¦--Quercus ilicifolia 71
25 ¦ ¦--Quercus kelloggii 64
26 ¦ ¦--Quercus lobata 37
27 ¦ ¦--Quercus macrocarpa 81
28 ¦ ¦--Quercus marilandica 67
29 ¦ ¦--Quercus nigra 45
30 ¦ ¦--Quercus parvula 37
31 ¦ ¦--Quercus prinoides 33
32 ¦ ¦--Quercus rubra 55
33 ¦ ¦--Quercus sadleriana 41
34 ¦ ¦--Quercus stellata 69
35 ¦ ¦--Quercus turbinella 58
36 ¦ ¦--Quercus vacciniifolia 63
37 ¦ ¦--Quercus velutina 52
38 ¦ ¦--Quercus virginiana 46
39 ¦ °--Quercus wislizeni 181
40 ¦--Moraceae 248
41 ¦ °--Morus 248
42 ¦ ¦--Morus alba 111
43 ¦ ¦--Morus microphylla 45
44 ¦ °--Morus rubra 92
45 ¦--Oleaceae 775
46 ¦ °--Fraxinus 775
47 ¦ ¦--Fraxinus americana 131
48 ¦ ¦--Fraxinus anomala 123
49 ¦ ¦--Fraxinus cuspidata 55
50 ¦ ¦--Fraxinus dipetala 113
51 ¦ ¦--Fraxinus pennsylvanica 240
52 ¦ °--Fraxinus velutina 113
53 ¦--Salicaceae 807
54 ¦ °--Populus 807
55 ¦ ¦--Populus angustifolia 75
56 ¦ ¦--Populus balsamifera 35
57 ¦ ¦--Populus deltoides 237
58 ¦ ¦--Populus fremontii 126
59 ¦ ¦--Populus grandidentata 101
60 ¦ °--Populus tremuloides 233
61 ¦--Sapindaceae 2086
62 ¦ °--Acer 2086
63 ¦ ¦--Acer circinatum 45
64 ¦ ¦--Acer glabrum 172
65 ¦ ¦--Acer macrophyllum 225
66 ¦ ¦--Acer negundo 464
67 ¦ ¦--Acer pensylvanicum 181
68 ¦ ¦--Acer platanoides 106
69 ¦ ¦--Acer rubrum 465
70 ¦ ¦--Acer saccharinum 89
71 ¦ ¦--Acer saccharum 130
72 ¦ ¦--Acer spicatum 167
73 ¦ °--Acer tataricum 42
74 °--Ulmaceae 550
75 °--Ulmus 550
76 ¦--Ulmus alata 99
77 ¦--Ulmus americana 265
78 ¦--Ulmus pumila 49
79 °--Ulmus rubra 137
summary_results <- read.csv("../species_summary_herb.csv")
# plot a figure to show the comparasion of anom slope and norm lope with confidence interval and model fitting
ggplot(summary_results, aes(x = anom_estimate, y = norm_estimate)) +
geom_errorbar(aes(ymin = norm_conf_low, ymax = norm_conf_high, alpha = r_squared), width = 0) +
geom_errorbarh(aes(xmin = anom_conf_low, xmax = anom_conf_high, alpha = r_squared), height = 0) +
geom_point(aes(alpha = r_squared), size = 3) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") + # Add 1:1 line
scale_alpha_continuous(range = c(0.2, 1)) + # Adjust alpha range for better visualization
labs(
title = "Slope Estimates for Anom and Norm by Species",
x = "Anom Slope Estimate",
y = "Norm Slope Estimate",
alpha = "R^2"
) +
theme_minimal() +
theme(legend.position = "right")ggplot(summary_results %>% filter(r_squared>0.3), aes(x = anom_estimate, y = norm_estimate)) +
geom_errorbar(aes(ymin = norm_conf_low, ymax = norm_conf_high, alpha = r_squared), width = 0) +
geom_errorbarh(aes(xmin = anom_conf_low, xmax = anom_conf_high, alpha = r_squared), height = 0) +
geom_point(aes(alpha = r_squared), size = 3) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") + # Add 1:1 line
scale_alpha_continuous(range = c(0.2, 1)) + # Adjust alpha range for better visualization
labs(
title = "Slope Estimates for Anom and Norm by Species",
x = "Anom Slope Estimate",
y = "Norm Slope Estimate",
alpha = "R^2"
) +
theme_minimal() +
theme(legend.position = "right")to_check <- summary_results %>%
#filter for the species that the anom slope and norm slope confidence interval do not overlap
filter(anom_conf_low > norm_conf_high | anom_conf_high < norm_conf_low) %>%
arrange(desc(r_squared))
print(to_check) species anom_estimate anom_conf_low anom_conf_high norm_estimate
1 Quercus nigra 2.43779652 -1.0419418 5.917535 -4.903699
2 Betula nigra -0.01562399 -3.5261793 3.494931 -4.629899
3 Ulmus alata 1.72035976 -2.3772538 5.817973 -5.869936
4 Betula glandulosa 5.54789354 0.2870689 10.808718 -6.449175
norm_conf_low norm_conf_high r_squared
1 -6.232752 -3.574645 0.5839906
2 -5.657750 -3.602047 0.4653512
3 -8.106634 -3.633238 0.2572838
4 -9.420004 -3.478347 0.2510565
to_check_species <- temperature_data %>%
filter(species %in% to_check$species)
result <- to_check_species %>%
group_by(species) %>%
do(broom::tidy(lm(doy ~ anom + norm, data = .), conf.int = TRUE)) %>%
ungroup()
# Count the number of unique species
num_tests <- temperature_data %>%
pull(species) %>%
unique() %>%
length()
# Bonferroni-adjusted alpha
alpha <- 0.05
adjusted_alpha <- alpha / num_tests
# Calculate the Bonferroni-adjusted critical value for the t-distribution
# Assuming degrees of freedom is the same for each model; you can adjust this part if needed
df <- to_check_species %>%
group_by(species) %>%
summarize(df = lm(doy ~ anom + norm, data = .)$df.residual) %>%
pull(df) %>%
unique() %>%
mean()
# Get the critical t-value
critical_value <- qt(1 - adjusted_alpha / 2, df)
# Adjust the confidence intervals
result <- result %>%
mutate(
conf.low.adjusted = estimate - critical_value * std.error,
conf.high.adjusted = estimate + critical_value * std.error
) %>%
dplyr::select(species, term, estimate, conf.low.adjusted, conf.high.adjusted) %>%
filter(term %in% c("anom", "norm")) %>%
tidyr::pivot_wider(names_from = term, values_from = c(estimate, conf.low.adjusted, conf.high.adjusted))
result %>%
filter(conf.low.adjusted_anom > conf.high.adjusted_norm | conf.high.adjusted_anom < conf.low.adjusted_norm) # A tibble: 0 × 7
# ℹ 7 variables: species <chr>, estimate_anom <dbl>, estimate_norm <dbl>,
# conf.low.adjusted_anom <dbl>, conf.low.adjusted_norm <dbl>,
# conf.high.adjusted_anom <dbl>, conf.high.adjusted_norm <dbl>